Chapter 10 Structured Corpus
There are many pre-collected corpora available for linguistic studies. This chapter will demonstrate how you can load existing corpora in R and perform basic corpus analysis with these data.
10.1 NCCU Spoken Mandarin
10.1.1 Loading the Corpus
10.1.2 Line Segmentation
10.1.3 Metadata vs. Transcript
# Header/metadata lines in CHAT transcripts begin with "@" (e.g. "@id" records).
NCCU_lines_meta <- filter(NCCU_lines, str_detect(line, "^@"))
# Transcript (non-metadata) lines: everything NOT starting with "@".
# Each line is numbered within its document, split on the tab into a speaker
# ID (`SPID`) and the utterance, and cleaned into `line2`:
#   - parenthesized pause notations (digits/dots)  -> " <PAUSE> "
#   - extralinguistic codes like "&=laugh"         -> " <EXTRALING> "
#   - overlap-bracket characters (U+2308..U+230B)  -> removed
#   - code-switching tags like "@s:eng"            -> removed
#   - runs of whitespace collapsed, then trimmed
NCCU_lines_data <- NCCU_lines %>%
filter(str_detect(line, "^[^@]")) %>%
group_by(doc_id) %>%
mutate(lineID = row_number()) %>% # running line number per transcript file
ungroup %>%
separate(line, into = c("SPID","line"), sep="\t") %>% # tab separates speaker code from utterance
mutate(line2 = line %>%
str_replace_all("\\([(\\.)0-9]+?\\)"," <PAUSE> ") %>% # <PAUSE>
str_replace_all("\\&\\=[a-z]+"," <EXTRALING> ") %>% # <EXTRALING>
str_replace_all("[\u2308\u2309\u230a\u230b]"," ") %>% # overlapping talk tags
str_replace_all("@[a-z:]+"," ") %>% # code switching tags
str_replace_all("\\s+"," ") %>% # additional whitespaces
str_trim())
NCCU_lines_data
10.1.4 Word Tokenization
# Tokenize the cleaned utterances (`line2`) into one word per row, splitting
# on whitespace; the raw `line` column is dropped first.
NCCU_words <- NCCU_lines_data %>%
select(-line) %>%
unnest_tokens(word, line2, token = function(x) str_split(x, "\\s+")) %>%
filter(word!="") # str_split() yields "" for empty lines; drop those tokens
NCCU_words
10.1.5 Word frequencies and Wordcloud
# Word frequency table:
#   freq       = total token count of the word across the corpus
#   dispersion = number of distinct documents the word occurs in
NCCU_words_freq <-NCCU_words %>%
count(word, doc_id) %>%
group_by(word) %>%
summarize(freq = sum(n), dispersion = n()) %>%
arrange(desc(freq), desc(dispersion))
# Wordcloud of word types, excluding annotation tags (<PAUSE>, <EXTRALING>)
# and Latin-alphabet tokens (keeps CJK word types).
library(wordcloud2) # library() errors on a missing package; require() only warns
NCCU_words_freq %>%
filter(str_detect(word, "^[^<a-z]")) %>%
select(word, freq) %>%
#mutate(freq = log(freq)) %>%
wordcloud2::wordcloud2(minSize = 0.5, size = 1, shape = "diamond") # "diamonds" is not a valid shape
10.1.6 Concordances
10.1.7 Collocations (Bigrams)
# n-gram generator for Chinese text (from ch. Chinese Text Processing).
# Splits `text` on whitespace or the ideographic space (U+3000) and joins
# every run of `n` consecutive tokens with `delimiter`.
# Returns "" when the text has fewer than `n` tokens (callers filter "" out).
ngram_chi <- function(text, n = 2, delimiter = "_"){
  word_vec <- unlist(strsplit(text, "\\s|\u3000"))
  n_tokens <- length(word_vec)
  if (n_tokens < n) {
    return("")
  }
  # One n-gram per starting position; base-R equivalent of purrr::map2_chr().
  vapply(seq_len(n_tokens - n + 1),
         function(i) paste(word_vec[i:(i + n - 1)], collapse = delimiter),
         character(1))
}
# Bigram table: one bigram per row, built line by line with ngram_chi(), so
# bigrams never cross utterance boundaries. One-word lines produce "" and
# are filtered out.
NCCU_bigrams <- NCCU_lines_data %>%
select(-line) %>%
unnest_tokens(bigrams, line2, token = function(x) map(x, ngram_chi, n = 2)) %>%
filter(bigrams!="")
# Inspect the bigram table, then compute per-bigram frequency/dispersion.
# (The original fused line assigned to `NCCU_bigramsNCCU_bigrams_freq`,
# leaving `NCCU_bigrams_freq` -- used below -- undefined.)
NCCU_bigrams
NCCU_bigrams_freq <- NCCU_bigrams %>%
count(bigrams, doc_id) %>%
group_by(bigrams) %>%
summarize(freq = sum(n), dispersion = n()) %>% # dispersion = distinct docs
arrange(desc(freq), desc(dispersion))
# Collocation statistics for bigrams:
#   O11   = observed bigram frequency
#   R1/C1 = unigram frequencies of the first/second word (looked up in
#           NCCU_words_freq via match())
#   E11   = expected bigram frequency under independence
#   MI    = log2(O11 / E11);  t = (O11 - E11) / sqrt(E11)
# NOTE(review): E11's denominator is sum(O11) over the frequency-filtered
# bigrams; the total unigram token count may be intended -- confirm.
NCCU_bigrams_freq %>%
filter(!str_detect(bigrams, "<")) %>% # drop bigrams containing annotation tags
filter(freq > 5) %>% # set bigram frequency cut-off
rename(O11 = freq) %>%
tidyr::separate(bigrams, c("w1", "w2")) %>% # split bigrams into two columns
mutate(R1 = NCCU_words_freq$freq[match(w1, NCCU_words_freq$word)],
C1 = NCCU_words_freq$freq[match(w2, NCCU_words_freq$word)]) %>% # retrieve w1 w2 unigram freq
mutate(E11 = (R1*C1)/sum(O11)) %>% # compute expected freq of bigrams
mutate(MI = log2(O11/E11), # compute associations
t = (O11 - E11)/sqrt(E11)) -> NCCU_collocations
NCCU_collocations %>%
arrange(desc(dispersion), desc(MI)) # sorting by MI
10.1.8 N-grams (Lexical Bundles)
###########################
# Chinese ngrams function #
###########################
# Generate ngram sequences from `text`.
# By default, `text` is assumed to have whitespaces as delimiters between
# tokens. Joins every run of `n` consecutive tokens with `delimiter`;
# returns "" when the text has fewer than `n` tokens.
ngram_chi <- function(text, n = 2, delimiter = "_"){
  word_vec <- unlist(strsplit(text, "\\s"))
  n_tokens <- length(word_vec)
  if (n_tokens < n) {
    return("")
  }
  # One n-gram per starting position; base-R equivalent of purrr::map2_chr().
  vapply(seq_len(n_tokens - n + 1),
         function(i) paste(word_vec[i:(i + n - 1)], collapse = delimiter),
         character(1))
}
# Wrapper to vectorize the function over `text` (one n-gram vector per line)
vngram_chi <- Vectorize(ngram_chi, vectorize.args = "text")
# 4-gram (lexical bundle) table; vngram_chi() is vectorized over lines, so
# each utterance yields its own 4-grams and bundles never span lines.
NCCU_ngrams <- NCCU_lines_data %>%
select(-line, -SPID) %>%
unnest_tokens(ngram, line2, token = function(x) vngram_chi(text = x, n = 4, delimiter = "_")) %>%
filter(ngram != "") # remove empty tokens (due to the short lines)
# 4-gram frequency/dispersion table, sorted by dispersion first (number of
# documents the bundle appears in), then raw frequency. Bundles containing
# annotation tags (<PAUSE>, <EXTRALING>) are excluded.
NCCU_ngrams %>%
count(ngram, doc_id) %>%
group_by(ngram) %>%
summarize(freq = sum(n), dispersion = n()) %>%
arrange(desc(dispersion), desc(freq)) %>%
ungroup %>%
filter(!str_detect(ngram,"<")) -> NCCU_ngrams_freq
NCCU_ngrams_freq
10.2 Connecting SPID to Metadata
# Self-defined function
# Fill empty speaker IDs by carrying the last non-empty SPID forward (lines
# without an explicit speaker code continue the previous speaker's turn).
# Fixes in this revision:
#   - seq_along() instead of 1:length() (safe for zero-length input)
#   - removed the dead `i <- i + 1` (reassigning a for-loop index is a no-op)
#   - guard i > 1 so a leading "" no longer indexes vec_filled[0], which
#     raised a zero-length-replacement error; a leading "" is kept as-is
fill_spid <- function(vec){
  vec_filled <- vec
  for (i in seq_along(vec_filled)) {
    if (i > 1 && vec_filled[i] == "") {
      vec_filled[i] <- vec_filled[i - 1]
    }
  }
  vec_filled
}
# Please check M005.cha
# Sanity check: find transcripts whose very first line has an empty speaker
# ID -- fill_spid() has no preceding line to fill those from.
NCCU_lines_data %>%
group_by(doc_id) %>%
filter(lineID == 1 & SPID=="")# Remove the typo case
# Clean speaker IDs and propagate them down to continuation lines:
#   - drop the known typo line (first line of M005.cha has an empty SPID)
#   - strip "*" and ":" from the speaker codes (e.g. "*A:" -> "A")
#   - fill empty SPIDs from the previous line within each document
#   - DOC_SPID = doc_id + SPID, a speaker identifier unique within a file
NCCU_lines_data_filled <- NCCU_lines_data %>%
filter(!(doc_id =="M005.cha" & lineID==1)) %>%
group_by(doc_id) %>%
mutate(SPID = str_replace_all(SPID, "[*:]","")) %>%
mutate(SPID_FILLED = fill_spid(SPID)) %>%
mutate(DOC_SPID = str_c(doc_id, SPID_FILLED, sep="_")) %>%
ungroup %>%
select(doc_id, lineID, line2, DOC_SPID)
NCCU_lines_data_filled
Based on the metadata of each file header, we can extract demographic information related to each speaker, including their ID, age, gender, etc.
# Speaker metadata extracted from the "@id" header lines. Each @id record is
# "|"-delimited; the fields kept are language (V2), speaker code (V3), age
# (V4), gender (V5), group (V7), and relation (V10). DOC_SPID links each
# record back to the transcript lines.
NCCU_meta <- NCCU_lines_meta %>%
  filter(str_detect(line, "^@(id)")) %>%
  separate(line, into = paste0("V", 1:11), sep = "\\|") %>%
  select(doc_id, V2, V3, V4, V5, V7, V10) %>%
  mutate(DOC_SPID = str_c(doc_id, V3, sep = "_")) %>%
  rename(LANG = V2,
         AGE = V4,
         GENDER = V5,
         GROUP = V7,
         RELATION = V10) %>%
  select(-V3) # V3 is now encoded in DOC_SPID
NCCU_meta


